library(DESeq2)
library(pheatmap)
library(dplyr)
library(dendextend)
library(ggplot2)
#Daniela File path
design_matrix<-read.table('/Users/danielaquijano/Documents/GitHub/Transcriptomics-Final-Project-/source_files/Experimental_Design_TG (1).csv',sep=',',header=TRUE)
head(design_matrix)
#Tasnim File path
#design_matrix<-read.table('/Users/tasnimtabassum/Documents/Transcriptomics_SP22/Experimental_Design_TG.csv',sep=',',header=TRUE)
#head(design_matrix)
rownames(design_matrix)<-design_matrix$Sample
design_matrix$Sample<-NULL
design_matrix
counts_matrix<-read.table("/Users/danielaquijano/Documents/GitHub/Transcriptomics-Final-Project-/Count_Tables/allcounts.csv",sep=',',header=TRUE)
counts_matrix
Because the numbers after the dot in the ensembl IDs represent versions of genes in certain annotations, we can remove these to more easily conduct our differential gene expression analysis.
counts_matrix$V1<-gsub("\\..*","",counts_matrix$V1)
counts_matrix
# remove the "V1" from col 1
rownames(counts_matrix)<-counts_matrix$V1
counts_matrix$V1<-NULL
#head(counts_matrix)
counts_matrix<-counts_matrix[,order(colnames(counts_matrix))]
counts_matrix
design_matrix<-design_matrix[order(rownames(design_matrix)),]
design_matrix
NA
design_matrix$Age = factor(design_matrix$Age)
design_matrix$Age
[1] eight eight two two eight eight two two two eight eight eight
[13] two two eight eight two two two two eight eight eight eight
[25] two two six six six six twelve twelve twelve twelve twelve twelve
[37] twelve twelve six six twelve twelve six six twelve twelve six six
[49] six six twelve twelve six six twelve twelve six six twelve twelve
[61] twelve twelve twelve twelve six six six six two two eight eight
Levels: eight six twelve two
dds <- DESeqDataSetFromMatrix(countData = counts_matrix,
colData = design_matrix,
design = ~ Age+Genotype)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
dds
class: DESeqDataSet
dim: 46075 72
metadata(1): version
assays(1): counts
rownames(46075): ENSMUSG00000000001 ENSMUSG00000000003 ... N_noFeature N_unmapped
rowData names(0):
colnames(72): SRR8512301 SRR8512302 ... SRR8512439 SRR8512440
colData names(3): Model Genotype Age
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 13457 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
normalizedcounts.matrix <- counts(dds,normalized=T)
vst_dds <- vst(dds)
dists <- dist(t(assay(vst_dds)))
head(vst_dds)
class: DESeqTransform
dim: 6 72
metadata(1): version
assays(1): ''
rownames(6): ENSMUSG00000000001 ENSMUSG00000000028 ... ENSMUSG00000000049
ENSMUSG00000000056
rowData names(43): baseMean baseVar ... replace dispFit
colnames(72): SRR8512301 SRR8512302 ... SRR8512439 SRR8512440
colData names(5): Model Genotype Age sizeFactor replaceable
PCA_Genotype<-plotPCA(vst_dds,intgroup=c("Age"))+labs(title = "PCA of mice of different ages", color = "Group")+coord_fixed(ratio=3)
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
PCA_Genotype
resultsNames(dds)
[1] "Intercept" "Age_six_vs_eight" "Age_twelve_vs_eight"
[4] "Age_two_vs_eight" "Genotype_rtg4510_vs_J20" "Genotype_WT_vs_J20"
[7] "Genotype_WT_TG_vs_J20"
dds$Age
[1] eight eight two two eight eight two two two eight eight eight
[13] two two eight eight two two two two eight eight eight eight
[25] two two six six six six twelve twelve twelve twelve twelve twelve
[37] twelve twelve six six twelve twelve six six twelve twelve six six
[49] six six twelve twelve six six twelve twelve six six twelve twelve
[61] twelve twelve twelve twelve six six six six two two eight eight
Levels: eight six twelve two
#Compare 2 vs 8
res_1 <- results(dds, contrast = c("Age", "two", "eight"))
res1_ordered <- res_1[order(res_1$padj),]
head(res1_ordered)
log2 fold change (MLE): Age two vs eight
Wald test p-value: Age two vs eight
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000024501 1037.7531 0.574615 0.0533228 10.77617 4.46085e-27 8.33420e-23
ENSMUSG00000030577 10.9689 -4.378147 0.4088867 -10.70748 9.38804e-27 8.76984e-23
ENSMUSG00000026303 45.6887 -1.152385 0.1270487 -9.07042 1.18562e-19 7.38363e-16
ENSMUSG00000030124 162.9878 -1.526436 0.1688798 -9.03859 1.58703e-19 7.41263e-16
ENSMUSG00000046805 863.4728 -1.317521 0.1476032 -8.92610 4.41290e-19 1.64892e-15
ENSMUSG00000029816 69.7214 -1.694819 0.1920511 -8.82483 1.09623e-18 3.41349e-15
plotDispEsts(dds)
Install Mouse annotation library:
library(biomaRt) #For conversion of transcript IDs to gene ID
library(annotables) #to retrieve grcm38 annotation for mouse genome
library(org.Mm.eg.db) #Mouse genome annotation
library(DOSE)
library(pathview)
library(clusterProfiler)
library(AnnotationHub)
library(ensembldb)
library(tidyverse)
library(ggnewscale)
# mouse genome load
grcm38
# check that ensgene in our df is prsent in the mouse genome df
idx <- grcm38$ensgene %in% rownames(res1_ordered)
# head(idx)
# df with all the ids that are in our df from the mouse genome df
ids <- grcm38[idx, ]
# head(ids)
# remove duplicates
non_duplicates <- which(duplicated(ids$ensgene) == FALSE)
ids <- ids[non_duplicates, ]
#nrow(res1_ordered)
#rownames(res1_ordered)
# entrezID contains only the IDs that are also in our df
ensgeneID= grcm38[grcm38$ensgene %in% rownames(res1_ordered), ]
# entrezID contains only the IDs that are also in our df
entrezID= grcm38[grcm38$ensgene %in% rownames(res1_ordered), ]
# check nrow ensgeneID
head(ensgeneID)
# create a vector of only the ensgeneIDs
ensgene_ID_vector = c(ensgeneID[[1]])
head(ensgene_ID_vector)
[1] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037"
[5] "ENSMUSG00000000049" "ENSMUSG00000000056"
# create a vector of only the entrezIDs
entrez_ID_vector = c(entrezID[[2]])
# create a vector of only the gene symbols
gene_symbols = subset(grcm38$symbol, grcm38$ensgene %in% rownames(res1_ordered))
# create new df that contains only the ensgeneIDs, lfc and padj
res2= data.frame(log2foldchange= subset(res1_ordered$log2FoldChange, grcm38$ensgene %in% rownames(res1_ordered)))
padj = subset(res1_ordered$padj, grcm38$ensgene %in% rownames(res1_ordered))
res2 = cbind(padj, res2)
res2 = cbind(ensgene_ID_vector, res2)
res2 = cbind(entrez_ID_vector, res2)
res2 = cbind(gene_symbols, res2)
# omit all "na" values
res2 = na.omit(res2)
res2 <- res2[order(res2$padj),]
head(res2)
## Significant genes is a vector of fold changes where the names are ENTREZ gene IDs. The background set is a vector of all the genes represented on the platform.
# bg entrez contains all the ensgene
allOEgenes = as.character(res2$ensgene_ID_vector)
# sig res entrez contains all the entrezIDs that have padj <0.05
head(res2)
sigOE <- subset(res2, padj< 0.05)
head(sigOE)
# vector of only lfc values
sigOE_genes = as.character(sigOE$ensgene_ID_vector)
head(sigOE_genes)
[1] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000037" "ENSMUSG00000000049"
[5] "ENSMUSG00000000056" "ENSMUSG00000000058"
nrow(sigOE)
[1] 3062
## Run GO enrichment analysis
ego <- enrichGO(gene = sigOE_genes,
universe = allOEgenes,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "ALL",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE,
pool = TRUE)
## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)
#gene_ratio = cluster_summary[order(cluster_summary$pvalue, decreasing = FALSE), ]
#head(gene_ratio)
cluster_summary
## Dotplot
dotplot(ego, showCategory=35)+theme(text = element_text(size = 1)) +scale_y_discrete(labels=function(x) str_wrap(x, width=40))+ggtitle('Enriched genes when comparing rtg4510 mice at 2 and 8 months')+ theme(plot.title = element_text(size=16))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
#barplot(ego, showCategory = 20)
options(ggrepel.max.overlaps = Inf)
## To color genes by log2 fold changes, we need to extract the log2 fold changes from our results table creating a named vector
OE_foldchanges <- sigOE$log2foldchange
names(OE_foldchanges) <- sigOE$gene_symbols
## Cnetplot details the genes associated with one or more terms - by default gives the top 1 significant term (by padj)
cnetplot(ego,
categorySize="pvalue",
showCategory = 5,
colorEdge = TRUE,
circular = FALSE,
node_label = "all",
cex_category = 1.5,
cex_gene = 0.75,
cex_label_category = 1.5,
cex_label_gene = 0.75,
shadowtext = "all")+ggtitle('Enriched genes when comparing rtg4510 mice at 2 and 8 months')
NA
NA
NA
NA
library(enrichplot)
ego2 = pairwise_termsim(ego)
emapplot(ego2, showCategory = 20, colorEdge = TRUE)+ggtitle('Enriched Genes when comparing J20 mice at 6 and 12 months of age')
# Set-up
#BiocManager::install("SPIA")
library(SPIA)
## Significant genes is a vector of fold changes where the names are ENTREZ gene IDs. The background set is a vector of all the genes represented on the platform.
# bg entrez contains all the entrezIDs
background_entrez <- res2$entrez_ID_vector
# sig res entrez contains all the entrezIDs that have padj <0.05
sig_res_entrez <- res2[which(res2$padj < 0.05), ]
# vector of only lfc values
sig_entrez <- sig_res_entrez$log2foldchange
head(sig_entrez)
[1] 0.574615 -1.152385 -1.317521 -1.694819 -1.351549 -1.217587
# adding entrezIDs as names for the sig entrez
names(sig_entrez) <- sig_res_entrez$entrez_ID_vector
head(sig_entrez)
14679 12544 107815 11818 67608 12390
0.574615 -1.152385 -1.317521 -1.694819 -1.351549 -1.217587
# remove dups
dups<-unique(names(sig_entrez[which(duplicated(names(sig_entrez)))]))
sig_entrez<-sig_entrez[!(names(sig_entrez) %in% dups)]
#de= as.vector(sig_entrez)
#de = sort(de, decreasing = FALSE)
# this step takes time
spia_result <- spia(de=sig_entrez, all=background_entrez, organism="mmu", plots=FALSE)
Done pathway 1 : RNA transport..
Done pathway 2 : RNA degradation..
Done pathway 3 : PPAR signaling pathway..
Done pathway 4 : Fanconi anemia pathway..
Done pathway 5 : MAPK signaling pathway..
Done pathway 6 : ErbB signaling pathway..
Done pathway 7 : Calcium signaling pathway..
Done pathway 8 : Cytokine-cytokine receptor int..
Done pathway 9 : Chemokine signaling pathway..
Done pathway 10 : NF-kappa B signaling pathway..
Done pathway 11 : Phosphatidylinositol signaling..
Done pathway 12 : Neuroactive ligand-receptor in..
Done pathway 13 : Cell cycle..
Done pathway 14 : Oocyte meiosis..
Done pathway 15 : p53 signaling pathway..
Done pathway 16 : Sulfur relay system..
Done pathway 17 : SNARE interactions in vesicula..
Done pathway 18 : Regulation of autophagy..
Done pathway 19 : Protein processing in endoplas..
Done pathway 20 : Lysosome..
Done pathway 21 : mTOR signaling pathway..
Done pathway 22 : Apoptosis..
Done pathway 23 : Vascular smooth muscle contrac..
Done pathway 24 : Wnt signaling pathway..
Done pathway 25 : Dorso-ventral axis formation..
Done pathway 26 : Notch signaling pathway..
Done pathway 27 : Hedgehog signaling pathway..
Done pathway 28 : TGF-beta signaling pathway..
Done pathway 29 : Axon guidance..
Done pathway 30 : VEGF signaling pathway..
Done pathway 31 : Osteoclast differentiation..
Done pathway 32 : Focal adhesion..
Done pathway 33 : ECM-receptor interaction..
Done pathway 34 : Cell adhesion molecules (CAMs)..
Done pathway 35 : Adherens junction..
Done pathway 36 : Tight junction..
Done pathway 37 : Gap junction..
Done pathway 38 : Complement and coagulation cas..
Done pathway 39 : Antigen processing and present..
Done pathway 40 : Toll-like receptor signaling p..
Done pathway 41 : NOD-like receptor signaling pa..
Done pathway 42 : RIG-I-like receptor signaling ..
Done pathway 43 : Cytosolic DNA-sensing pathway..
Done pathway 44 : Jak-STAT signaling pathway..
Done pathway 45 : Natural killer cell mediated c..
Done pathway 46 : T cell receptor signaling path..
Done pathway 47 : B cell receptor signaling path..
Done pathway 48 : Fc epsilon RI signaling pathwa..
Done pathway 49 : Fc gamma R-mediated phagocytos..
Done pathway 50 : Leukocyte transendothelial mig..
Done pathway 51 : Intestinal immune network for ..
Done pathway 52 : Circadian rhythm - mammal..
Done pathway 53 : Long-term potentiation..
Done pathway 54 : Neurotrophin signaling pathway..
Done pathway 55 : Retrograde endocannabinoid sig..
Done pathway 56 : Glutamatergic synapse..
Done pathway 57 : Cholinergic synapse..
Done pathway 58 : Serotonergic synapse..
Done pathway 59 : GABAergic synapse..
Done pathway 60 : Dopaminergic synapse..
Done pathway 61 : Long-term depression..
Done pathway 62 : Olfactory transduction..
Done pathway 63 : Taste transduction..
Done pathway 64 : Phototransduction..
Done pathway 65 : Regulation of actin cytoskelet..
Done pathway 66 : Insulin signaling pathway..
Done pathway 67 : GnRH signaling pathway..
Done pathway 68 : Progesterone-mediated oocyte m..
Done pathway 69 : Melanogenesis..
Done pathway 70 : Adipocytokine signaling pathwa..
Done pathway 71 : Type II diabetes mellitus..
Done pathway 72 : Type I diabetes mellitus..
Done pathway 73 : Maturity onset diabetes of the..
Done pathway 74 : Aldosterone-regulated sodium r..
Done pathway 75 : Endocrine and other factor-reg..
Done pathway 76 : Vasopressin-regulated water re..
Done pathway 77 : Salivary secretion..
Done pathway 78 : Gastric acid secretion..
Done pathway 79 : Pancreatic secretion..
Done pathway 80 : Carbohydrate digestion and abs..
Done pathway 81 : Bile secretion..
Done pathway 82 : Mineral absorption..
Done pathway 83 : Alzheimer's disease..
Done pathway 84 : Parkinson's disease..
Done pathway 85 : Amyotrophic lateral sclerosis ..
Done pathway 86 : Huntington's disease..
Done pathway 87 : Prion diseases..
Done pathway 88 : Cocaine addiction..
Done pathway 89 : Amphetamine addiction..
Done pathway 90 : Morphine addiction..
Done pathway 91 : Alcoholism..
Done pathway 92 : Bacterial invasion of epitheli..
Done pathway 93 : Salmonella infection..
Done pathway 94 : Pertussis..
Done pathway 95 : Legionellosis..
Done pathway 96 : Leishmaniasis..
Done pathway 97 : Chagas disease (American trypa..
Done pathway 98 : African trypanosomiasis..
Done pathway 99 : Malaria..
Done pathway 100 : Toxoplasmosis..
Done pathway 101 : Amoebiasis..
Done pathway 102 : Staphylococcus aureus infectio..
Done pathway 103 : Tuberculosis..
Done pathway 104 : Hepatitis C..
Done pathway 105 : Measles..
Done pathway 106 : Influenza A..
Done pathway 107 : HTLV-I infection..
Done pathway 108 : Herpes simplex infection..
Done pathway 109 : Epstein-Barr virus infection..
Done pathway 110 : Pathways in cancer..
Done pathway 111 : Transcriptional misregulation ..
Done pathway 112 : Viral carcinogenesis..
Done pathway 113 : Colorectal cancer..
Done pathway 114 : Renal cell carcinoma..
Done pathway 115 : Pancreatic cancer..
Done pathway 116 : Endometrial cancer..
Done pathway 117 : Glioma..
Done pathway 118 : Prostate cancer..
Done pathway 119 : Thyroid cancer..
Done pathway 120 : Basal cell carcinoma..
Done pathway 121 : Melanoma..
Done pathway 122 : Bladder cancer..
Done pathway 123 : Chronic myeloid leukemia..
Done pathway 124 : Acute myeloid leukemia..
Done pathway 125 : Small cell lung cancer..
Done pathway 126 : Non-small cell lung cancer..
Done pathway 127 : Asthma..
Done pathway 128 : Autoimmune thyroid disease..
Done pathway 129 : Systemic lupus erythematosus..
Done pathway 130 : Rheumatoid arthritis..
Done pathway 131 : Allograft rejection..
Done pathway 132 : Graft-versus-host disease..
Done pathway 133 : Arrhythmogenic right ventricul..
Done pathway 134 : Dilated cardiomyopathy..
Done pathway 135 : Viral myocarditis..
write.csv(spia_result, file = "spia_result_J20_age.csv")
# view one record at a time
subset(spia_result, ID == "04727")
head(res1_ordered)
log2 fold change (MLE): Age two vs eight
Wald test p-value: Age two vs eight
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000024501 1037.7531 0.574615 0.0533228 10.77617 4.46085e-27 8.33420e-23
ENSMUSG00000030577 10.9689 -4.378147 0.4088867 -10.70748 9.38804e-27 8.76984e-23
ENSMUSG00000026303 45.6887 -1.152385 0.1270487 -9.07042 1.18562e-19 7.38363e-16
ENSMUSG00000030124 162.9878 -1.526436 0.1688798 -9.03859 1.58703e-19 7.41263e-16
ENSMUSG00000046805 863.4728 -1.317521 0.1476032 -8.92610 4.41290e-19 1.64892e-15
ENSMUSG00000029816 69.7214 -1.694819 0.1920511 -8.82483 1.09623e-18 3.41349e-15
norm_counts_top_40 = normalizedcounts.matrix[row.names(head(res1_ordered, 40)), ]
nrow(norm_counts_top_40)
[1] 40
head(design_matrix)
annotation_columns<-design_matrix
row.names(annotation_columns) <- colnames(norm_counts_top_40)
library(pheatmap)
tiff("Heatmap_2,8_age.tiff", width = 7, height = 5, units = 'in', res = 300)
pheatmap(norm_counts_top_40, color=colorRampPalette(c("white", "lavender", "purple4"))(30), scale="row", cluster_cols = T, show_rownames = T,fontsize = 7,fontsize_row = 4, fontsize_col = 4,labels_row = rownames(dists),annotation_col =annotation_columns,main='Differentially Expressed Genes in rtg4510 mice at 2 and 8 months old' )
dev.off()
null device
1